Today’s Goal
if (!require(ggwaffle)) {
remotes::install_github("liamgilbey/ggwaffle")
}
pacman::p_load(tidyverse,
gapminder,
gcookbook,
ggwaffle,
ggcharts,
ggridges,
GGally,
emojifont)Source: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper%20functions
summarySE <- function(data = NULL,
measurevar,
groupvars = NULL,
na.rm = FALSE,
conf.interval = .95,
.drop = TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm = FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop = .drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm = na.rm),
mean = mean (xx[[col]], na.rm = na.rm),
sd = sd (xx[[col]], na.rm = na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}A simple bar chart.
data(mpg)
bar1 <- ggplot(mpg, aes(x = manufacturer)) +
geom_bar()
plot(bar1)Rotate the plot.
bar2 <- ggplot(mpg, aes(y = manufacturer)) +
geom_bar()
plot(bar2)Alternatively, you can do:
bar2b <- bar1 +
coord_flip()
plot(bar2b)Reverse the order of the category.
bar3 <- ggplot(mpg, aes(y = reorder(manufacturer, desc(manufacturer)))) +
geom_bar() +
labs(y = "Manufacturer")
plot(bar3)Proportion
bar4 <- ggplot(mpg,
aes(x = (after_stat(count) / sum(after_stat(count))),
y = reorder(manufacturer, desc(manufacturer)))) +
geom_bar() +
labs(y = "Manufacturer",
x = "Proportion")
plot(bar4)data(Titanic)
Titanic <- as_tibble(Titanic)
pbar1 <- ggplot(Titanic, aes(x = Class, y = n, fill = Sex)) +
geom_bar(stat = "identity",
position = "dodge") +
scale_fill_brewer(palette = "Set2")
plot(pbar1)pbar2 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
geom_bar(stat = "identity",
position = "dodge") +
scale_fill_brewer(palette = "Accent")
plot(pbar2) sbar1 <- ggplot(Titanic, aes(x = Class, y = n, fill = Sex)) +
geom_bar(stat = "identity",
position = "stack") +
scale_fill_brewer(palette = "Set2")
plot(sbar1)sbar2 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
geom_bar(stat = "identity",
position = "stack") +
scale_fill_brewer(palette = "Accent")
plot(sbar2) sbar3 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
geom_bar(stat = "identity",
position = "fill") +
scale_fill_brewer(palette = "Accent")
plot(sbar3) gapminder_sub <- gapminder |>
filter(year %in% c(1997, 2007)) |>
filter(continent == "Asia") |>
select(country, year, gdpPercap) |>
pivot_wider(id_cols = country,
names_from = year,
names_prefix = "year_",
values_from = gdpPercap) |>
mutate(gdp_change = (year_2007 - year_1997) / year_1997)
dbar1 <- ggplot(gapminder_sub, aes(y = country, x = gdp_change)) +
geom_bar(stat = "identity") +
labs(x = "GDP change", y = "Country")
plot(dbar1)dbar2 <- ggplot(gapminder_sub, aes(y = country, x = gdp_change,
fill = gdp_change > 0)) +
geom_bar(stat = "identity") +
labs(x = "GDP change", y = "Country") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "none")
plot(dbar2)dbar3 <- ggplot(gapminder_sub, aes(y = reorder(country, gdp_change),
x = gdp_change,
fill = gdp_change > 0)) +
geom_bar(stat = "identity") +
labs(x = "GDP change", y = "Country") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "none")
plot(dbar3)data(starwars)
starwars_chars <- starwars |>
filter(!is.na(homeworld)) |>
group_by(homeworld, gender) |>
summarize(average_height = mean(height, na.rm = TRUE),
.groups = "drop") |>
group_by(homeworld) |>
filter(n() == 2) |>
mutate(average_height = ifelse(gender == "feminine",
average_height,
-1 * average_height))
dbar4 <- starwars_chars |>
ggplot(aes(y = homeworld, x = average_height, fill = gender)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("dodgerblue", "tomato")) +
scale_x_continuous(breaks = seq(-200, 200, by = 100),
labels = c(200, 100, 0, 100, 200)) +
labs(x = "Average height",
y = "Homeworld")
plot(dbar4)tophit <- tophitters2001[1:25, ] # Take the top 25 from the tophitters data set
dplot1 <- ggplot(tophit, aes(x = avg, y = name)) +
geom_point()
plot(dplot1)dplot2 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
geom_point(size = 3) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color = "grey60",
linetype = "dashed")) +
labs(x = "Average", y = "Name")
plot(dplot2)dplot3 <- ggplot(tophit, aes(x = reorder(name, avg), y = avg)) +
geom_point(size = 3) + # Use a larger dot
theme_bw() +
theme(panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(color = "grey60",
linetype = "dashed"),
axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(x = "Name", y = "Average")
plot(dplot3)dplot4 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
geom_segment(aes(yend = name), xend = 0, color = "grey50") +
geom_point(size = 3, aes(colour = lg)) +
scale_color_brewer(palette = "Set1",
name = "League",
limits = c("NL", "AL")) +
theme_bw() +
theme(panel.grid.major.y = element_blank(),
legend.position = "bottom") +
labs(x = "Average", y = "Name")
plot(dplot4)dplot5 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
geom_segment(aes(yend = name), xend = 0, color = "grey50") +
geom_point(size = 3, aes(colour = lg)) +
scale_colour_brewer(palette = "Set1") +
theme_bw() +
theme(panel.grid.major.y = element_blank(),
legend.position = "none") +
facet_grid(rows = vars(lg),
scales = "free_y",
space = "free_y") +
labs(x = "Average", y = "Name")
plot(dplot5)download.file(
url = "https://github.com/yukiyanai/quant-methods-R/raw/master/data_fixed/hr96-17.csv",
dest = "data/hr96-17.csv")Or download the data set manually.
HR09 <- read_csv("data/hr96-17.csv") |>
filter(year == 2009) |>
mutate(wl = factor(wl,
levels = 0 : 2,
labels = c('lost', 'won', 'zombie')),
status = factor(status,
levels = 0 : 2,
labels = c('challenger', 'incumbent', 'ex-member')))
# Cross Table
(tbl_st_wl2 <- with(HR09, table(wl, status)))## status
## wl challenger incumbent ex-member
## lost 558 170 14
## won 80 175 45
## zombie 43 52 2
# total num. of candidates by status
tbl_st <- with(HR09, table(status))
# turn the table into a matrix
tbl_st_wl2 <- as.matrix(tbl_st_wl2[1:3, 1:3])
tbl_st <- as.matrix(tbl_st[1:3])
# Create a data frame
# make variables of the share of each status and
# of the share of each result for each status
df <- data.frame(status = levels(HR09$status),
status_pct = 100 * as.vector(tbl_st / sum(tbl_st)),
win = 100 * as.vector(tbl_st_wl2[2,] / tbl_st),
zombie = 100 * as.vector(tbl_st_wl2[3,] / tbl_st),
lose = 100 * as.vector(tbl_st_wl2[1,] / tbl_st))
# Calculate the boundaries of categories on the horizontal axis
df <- df |>
mutate(xmax = cumsum(status_pct),
xmin = xmax - status_pct)
## Remove the variable status_pct because we won't use it
df$status_pct <- NULL
# Reshape the data frame
df_long <- df %>% gather(key = result, value = voteshare, win:lose)
##################################################
## COMPARE df_long with df
## to understand what gather() has accomplished
## df
## df_long
###################################################
# Calculate the boundaries of categories on the vertical axis
df_long_1 <- df_long %>%
group_by(status) %>%
mutate(ymax = cumsum(voteshare),
ymin = ymax - voteshare)
# Detemine the locations to put texts
df_long_1 <- df_long_1 %>%
mutate(xtext = xmin + (xmax - xmin) / 2,
ytext = ymin + (ymax - ymin) / 2)
# Make a mosaic plot
mosaic <- ggplot(df_long_1, aes(ymin = ymin, ymax = ymax,
xmin = xmin, xmax = xmax,
fill = result))
mosaic <- mosaic +
geom_rect(color = I('grey')) +
geom_text(aes(x = xtext, y = ytext, label = paste0(round(voteshare), '%'))) +
geom_text(aes(x = xtext, y = 103, label = status)) +
labs(x = '', y = '')
plot(mosaic)waffle_data <- waffle_iron(mpg, aes_d(group = class))
waffle1 <- ggplot(waffle_data, aes(x, y, fill = group)) +
geom_waffle()
plot(waffle1)waffle2 <- ggplot(waffle_data, aes(x, y, fill = group)) +
geom_waffle() +
coord_equal() +
scale_fill_waffle() +
theme_waffle()
plot(waffle2)waffle3 <- ggplot(waffle_data, aes(x, y, color = group)) +
geom_waffle(tile_shape = "circle", size = 4) +
coord_equal() +
scale_colour_waffle() +
theme_waffle()
plot(waffle3)waffle_data$label = fontawesome("fa-twitter")
waffle4 <- ggplot(waffle_data, aes(x, y, color = group)) +
geom_text(aes(label = label),
family = "fontawesome-webfont",
size = 3) +
scale_color_brewer(palette = "Set1") +
coord_equal() +
theme_waffle()
plot(waffle4)hist1 <- ggplot(mpg, aes(x = cty)) +
geom_histogram(color = "black") +
labs(x = "City miles per gallon")
plot(hist1)Change the bin width.
hist2 <- ggplot(mpg, aes(x = cty)) +
geom_histogram(color = "black",
binwidth = 5) +
labs(x = "City miles per gallon")
plot(hist2)Shift bin boundaries.
hist3 <- ggplot(mpg, aes(x = cty)) +
geom_histogram(color = "black",
binwidth = 2,
boundary = 0) +
labs(x = "City miles per gallon")
plot(hist3)You can use ggcharts::pyrmid_chart()
data(popch)
pyramid1 <- pyramid_chart(popch, x = age, y = pop, group = sex,
bar_colors = c("darkorange", "darkblue"),
xlab = "Population") ## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
plot(pyramid1)Standard errors.
data(ToothGrowth)
tgc <- summarySE(ToothGrowth,
measurevar = "len",
groupvars = c("supp", "dose"))
tgc2 <- tgc |>
mutate(dose = factor(dose))
eb1 <- ggplot(tgc2, aes(x = dose, y = len, fill = supp)) +
geom_bar(position = "dodge",
stat = "identity") +
geom_errorbar(aes(ymin = len - se, ymax = len + se),
width = .2,
position = position_dodge(.9)) +
scale_fill_brewer(palette = "Dark2",
name = "Supplement type") +
labs(x = "Dose (mg / day)",
y = "Tooth length")
plot(eb1)95% confidence interval.
eb2 <- ggplot(tgc2, aes(x = dose, y = len, fill = supp)) +
geom_bar(position = "dodge",
stat = "identity") +
geom_errorbar(aes(ymin = len - ci, ymax = len + ci),
width = .2,
position = position_dodge(.9)) +
scale_fill_brewer(palette = "Dark2",
name = "Supplement type") +
labs(x = "Dose (mg / day)",
y = "Tooth length")
plot(eb2)95% confidence interval.
ci1 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE) +
labs(x = "Miles / US gallon",
y = "Weight (1000 lbs)")
plot(ci1)99.9% confidence interval.
ci2 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE, level = 0.999) +
labs(x = "Miles / US gallon",
y = "Weight (1000 lbs)")
plot(ci2)Loess smoother with 87% confidence interval.
ci3 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
geom_point() +
geom_smooth(method = "loess", se = TRUE, level = 0.87) +
labs(x = "Miles / US gallon",
y = "Weight (1000 lbs)")
plot(ci3)p <- ggplot(mpg, aes(x = class, y = hwy))
box1 <- p +
geom_boxplot()
plot(box1)Rotate:
box2 <- ggplot(mpg, aes(y = class, x = hwy)) +
geom_boxplot()
plot(box2)Or
plot(box1 + coord_flip())Width proportion to the square-roots fo the number of observations of each group.
box3 <- p +
geom_boxplot(varwidth = TRUE)
plot(box3)Highlight (potential) outliers.
box4 <- p +
geom_boxplot(outlier.color = "red",
outlier.shape = 1)
plot(box4)With original data points with jitter.
box5 <- p +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2)
plot(box5)box6 <- p +
geom_boxplot(aes(fill = drv))
plot(box6)violin1 <- p +
geom_violin()
plot(violin1)Rotate:
violin2 <- ggplot(mpg, aes(y = class, x = hwy)) +
geom_violin()
plot(violin2)Or
plot(violin1 + coord_flip())With original data points with jitter.
violin3 <- p +
geom_violin() +
geom_jitter(width = 0.2)
plot(violin3)Violin chart with box-and-whisker plots
violin4 <- p +
geom_violin() +
geom_boxplot(fill = "gray",
width = 0.2)
plot(violin4)ridge1 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges()
plot(ridge1)Closed polygon
ridge2 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges2()
plot(ridge2)Control the overlaps: “scale = 1” touches exactly
ridge3 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(scale = 1)
plot(ridge3)Control the overlaps: “scale < 1” makes no overlap
ridge4 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(scale = 0.9)
plot(ridge4)Control the overlaps: “scale > 1” overlaps
ridge5 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
geom_density_ridges(scale = 3)
plot(ridge5)ridge6 <- ggplot(lincoln_weather, aes(x = `Mean Temperature [F]`,
y = Month,
fill = stat(x))) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
labs(title = "Temperatures in Lincoln NE in 2016")
plot(ridge6)ridge7 <- ggplot(iris, aes(x = Sepal.Length,
y = Species,
fill = factor(stat(quantile))),
alpha = 1/5) +
stat_density_ridges(
geom = "density_ridges_gradient",
calc_ecdf = TRUE,
quantiles = c(0.025, 0.975)
) +
scale_fill_manual(
name = "Probability",
values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
labels = c("(0, 0.025]", "(0.025, 0.975]", "(0.975, 1]")
)
plot(ridge7)You can use GGally::ggcoef().
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
data = iris)
coef1 <- ggcoef(fit)
plot(coef1)Exclude the intercept.
coef2 <- ggcoef(fit,
exclude_intercept = TRUE)
plot(coef2)Customize: 99% confidence interval instead of 95%.
coef3 <- ggcoef(fit,
exclude_intercept = TRUE,
conf.level = 0.99,
color = "royalblue",
size = 5)
plot(coef3)Will learn in Lectures 11 and 12.